# This script calculates cosine similarities with the opposition-minus-support vector.
# It does this for each country and the largest version (sample size).
# It varies the number of target words sampled for each year-week.
library(quanteda)
library(conText)
library(dplyr)
library(text2vec)
library(stringr)
source("utils.R")
set.seed(123L)

# Sampling grid: how many contexts to keep per year-week, and which country
# corpora to process. Both vectors are read again by the similarity step.
sample_sizes <- c(2, 5, 10, 15, 20, 50, 100)
countries <- c("djazairess", "maghress", "masress", "sauress", "turess")

# Write one down-sampled tokens file per (country, sample size) pair
for (country in countries) {
  # Full set of target contexts for this country (produced by an earlier step)
  toks_all <- readRDS(
    paste0("data/analysis_toks/", country, "_target_toks_leader.rds")
  )

  # One row per context: its document name and its year-week.
  # Assumes the tokens object carries a `yearwk` docvar.
  context_index <- data.frame(
    doc_id = docnames(toks_all),
    yearwk = docvars(toks_all)$yearwk,
    stringsAsFactors = FALSE
  )

  # Grouping does not depend on the sample size, so do it once per country
  weekly_index <- group_by(context_index, yearwk)

  for (n_per_week in sample_sizes) {
    # Draw up to `n_per_week` contexts from every year-week; weeks holding
    # fewer contexts than that contribute all of them
    drawn <- ungroup(
      group_modify(
        weekly_index,
        function(rows, key) slice_sample(rows, n = min(n_per_week, nrow(rows)))
      )
    )

    # Keep only the drawn contexts and store them under a name that encodes
    # the sample size
    out_file <- paste0("data/analysis_toks/", country, "_target_toks_leader_sample", n_per_week, ".rds")
    saveRDS(toks_all[drawn$doc_id], file = out_file)
    cat("Saved sample tokens for", country, "sample size", n_per_week, "to", out_file, "\n")
  }
}

# Anchor words (Arabic) for the two poles of the contrast: `first_ar` is
# "the opposition" and `second_ar` is "support". process_cos_sim() passes them
# to get_similarity_scores2() as `first_vec` and `second_vec`.
first_ar <- "المعارضة"
second_ar <- "الدعم"

# Cosine similarities for one country, computed from its pre-sampled contexts.
#
# Args:
#   country_name:    corpus identifier; part of the sampled tokens file name.
#   local_glove:     forwarded to get_similarity_scores2() as `pre_trained`.
#   local_transform: forwarded to get_similarity_scores2() as
#                    `transform_matrix`.
#   sample_size:     per-year-week sample size; selects which sampled tokens
#                    file is used.
#
# Returns:
#   The value of get_similarity_scores2() (defined outside this file), using
#   the global anchor words `first_ar` and `second_ar`.
process_cos_sim <- function(country_name, local_glove, local_transform, sample_size) {
  # The contexts were sampled and saved beforehand, so only the path to the
  # matching file is assembled here
  sampled_toks_path <- paste0(
    "data/analysis_toks/", country_name,
    "_target_toks_leader_sample", sample_size, ".rds"
  )

  cat("Getting cos_sims for country", country_name, "with sample size", sample_size, "\n")

  # The similarity helper is handed the file path rather than a tokens object;
  # the remaining settings are fixed for every country and sample size
  similarity_scores <- get_similarity_scores2(
    target_toks_file = sampled_toks_path,
    target = "TARGETWORD",
    first_vec = first_ar,
    second_vec = second_ar,
    pre_trained = local_glove,
    transform_matrix = local_transform,
    group_var = "yearwk",
    window = 12L,
    norm = "l2"
  )

  similarity_scores
}

# Compute and save cosine similarities for every country at one sample size.
#
# Loads the combined embedding (transform matrix and GloVe vectors), then for
# each entry of the global `countries` vector calls process_cos_sim() and
# writes the result to
#   data/output/cos_sims/<country>/cos_simsdf_all<sample_size>targwords.rds
#
# Args:
#   sample_size: number of contexts sampled per year-week; selects which
#     pre-sampled tokens file process_cos_sim() uses and is embedded in the
#     output file name.
#
# Returns:
#   NULL, invisibly. Called for its side effect of writing .rds files.
process_sample_size <- function(sample_size) {
  # Read the combined embedding for the specific version (example file names).
  # NOTE(review): both files are re-read on every call, i.e. once per sample
  # size, although their paths do not depend on `sample_size`.
  local_transform <- readRDS("data/embedding_combined/combined_local_transform150000030k.rds")
  local_glove <- readRDS("data/embedding_combined/combined_local_glove150000030k.rds")

  for (country in countries) {
    cos_sims <- process_cos_sim(country, local_glove, local_transform, sample_size)

    # Write each country's result as soon as it is available, so that a
    # failure on a later country does not discard work already finished and
    # every result is paired with its own country by construction
    dir_name <- paste0("data/output/cos_sims/", country)
    if (!dir.exists(dir_name)) {
      dir.create(dir_name, recursive = TRUE)
    }
    saveRDS(
      cos_sims,
      paste0(dir_name, "/cos_simsdf_all", sample_size, "targwords.rds")
    )
  }

  invisible(NULL)
}

# Run the similarity step once for each sample size. A plain `for` loop is
# used because process_sample_size() is called only for its side effect of
# writing .rds files; lapply() would collect the unused return values into a
# list, which is auto-printed when the script runs at top level via Rscript.
for (sample_size in sample_sizes) {
  process_sample_size(sample_size)
}
